// CuNNy 3x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N03
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"

// Source image; sampled by Pass 1 and again by Pass 5.
//!TEXTURE
Texture2D INPUT;

// Final image, declared at twice the input width and height; written by Pass 5.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; the O() tap macro samples through it.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler; Pass 5 uses it when it re-samples INPUT.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, p): sample texture t at mip level 0 through the point sampler SP, at the
// coordinate `pos + p * pt`. The expansion refers to `pos` and `pt` by name, so
// both must be declared in the scope where the macro is used.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for MF4 / MF4x4 (those types are declared outside this file).
#define V4 MF4
#define M4 MF4x4

// Intermediate feature textures, both declared at the input size. The passes
// alternate between them: Pass 1 writes t0, Pass 2 reads t0 and writes t1,
// Pass 3 reads t1 and writes t0, Pass 4 reads t0 and writes t1, Pass 5 reads t1.
// R8G8B8A8_SNORM is a signed-normalized 8-bit-per-channel format, so each stored
// channel is limited to [-1, 1].
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0

// l0(x, y): scalar tap for Pass 1. Samples INPUT at offset (x, y) via O(), takes the
// dot product of its RGB with a fixed 3-component weight vector and adds a constant.
// Like O(), it needs `pos` and `pt` in scope.
#define l0(x, y) (dot(MF3(-2.683e-01, -5.217e-01, -1.382e-01), O(INPUT, float2(x, y)).rgb) + MF(7.973e-01))

// Pass 1 kernel: combines nine scalar taps into four output channels.
// `r` starts from a constant 4-vector and each tap s0_k adds s0_k times its own
// constant 4-vector via mad(). Pass1 supplies the taps as a 3x3 neighbourhood in
// row-major order (s0_0 = offset (-1,-1) ... s0_8 = offset (1,1)).
// The constants are fixed coefficients (presumably trained CuNNy weights); the order
// of the mad() chain is kept as generated because it determines rounding.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 r = { 3.156e-02, 7.379e-02, 1.078e-02, -5.510e-04 };
	r = mad(s0_0, V4(1.850e-01, -2.860e-02, -5.321e-01, 2.390e-03), r);
	r = mad(s0_1, V4(-4.299e-01, -2.946e-02, -1.180e-01, -5.652e-02), r);
	r = mad(s0_2, V4(-4.798e-01, -2.276e-02, 3.201e-02, 4.870e-02), r);
	r = mad(s0_3, V4(2.783e-01, -2.262e-03, -1.864e-01, 1.793e-01), r);
	r = mad(s0_4, V4(9.435e-04, 8.115e-01, 7.806e-01, -7.793e-01), r);
	r = mad(s0_5, V4(2.180e-01, -2.564e-05, 2.774e-03, -7.015e-02), r);
	r = mad(s0_6, V4(1.479e-03, -4.675e-02, 3.323e-02, 3.392e-01), r);
	r = mad(s0_7, V4(1.203e-01, 1.509e-02, 5.239e-02, 3.194e-01), r);
	r = mad(s0_8, V4(7.680e-02, -4.310e-02, -7.203e-02, 1.255e-02), r);
	return r;
}

// Pass 1 ("in"): each thread handles one input pixel. It evaluates the l0() tap at
// the nine offsets of a 3x3 window around that pixel and writes f0() of those taps
// to t0.
//   blockStart - pixel coordinate of this thread group's block
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8()
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro inside l0() refers to them.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the image have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Row y = -1
	MF k0 = l0(-1.0, -1.0);
	MF k1 = l0(0.0, -1.0);
	MF k2 = l0(1.0, -1.0);
	// Row y = 0
	MF k3 = l0(-1.0, 0.0);
	MF k4 = l0(0.0, 0.0);
	MF k5 = l0(1.0, 0.0);
	// Row y = +1
	MF k6 = l0(-1.0, 1.0);
	MF k7 = l0(0.0, 1.0);
	MF k8 = l0(1.0, 1.0);

	t0[texel] = f0(k0, k1, k2, k3, k4, k5, k6, k7, k8);
}

//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// l0(x, y): Pass 2 tap — the four channels of t0 at offset (x, y), sampled via O()
// (which needs `pos` and `pt` in scope).
#define l0(x, y) V4(O(t0, float2(x, y)))

// Pass 2 ("conv1") kernel: maps eighteen 4-channel inputs to one 4-channel output.
// `r` starts from a constant 4-vector; every input is then folded in through
// MulAdd() together with its own constant 4x4 matrix (MulAdd is defined outside
// this file — presumably r + mul(input, matrix); confirm against its definition).
// Pass2 passes the 3x3 window twice: s0_k = max(tap_k, 0) and
// s1_k = -max(-tap_k, 0), with k in row-major order.
// The constants are fixed coefficients (presumably trained CuNNy weights); keep the
// accumulation order as generated, since it determines rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = { 1.427e-02, -1.982e-02, 4.114e-03, -2.883e-02 };
	r = MulAdd(s0_0, M4(1.949e-01, -1.247e-01, -7.307e-02, 8.783e-02, -4.773e-02, 6.012e-02, 8.043e-02, -8.489e-02, 6.760e-02, -7.809e-02, -4.745e-02, -1.304e-02, -1.402e-01, -1.248e-01, 3.334e-01, -1.498e-01), r);
	r = MulAdd(s0_1, M4(7.053e-02, 9.895e-02, 1.655e-01, 2.251e-01, 3.511e-02, -1.010e-01, -2.736e-01, 1.174e-01, -2.551e-01, 1.100e-01, 1.518e-01, -4.343e-02, -9.293e-01, 5.327e-01, -2.723e-01, 4.006e-01), r);
	r = MulAdd(s0_2, M4(-2.390e-02, 8.154e-03, -2.332e-02, -3.708e-02, 2.814e-02, 5.506e-02, -2.627e-01, -8.081e-02, -1.062e-01, -6.819e-02, -9.498e-02, -2.749e-01, -2.457e-01, 6.868e-01, 6.527e-03, 7.676e-01), r);
	r = MulAdd(s0_3, M4(2.704e-01, 4.055e-02, -4.756e-01, 2.506e-01, -9.498e-02, 5.838e-02, 1.733e-01, 3.420e-03, -7.051e-02, -8.233e-02, -3.006e-01, 6.824e-02, -1.308e-01, 1.196e-01, 2.560e-01, 8.304e-02), r);
	r = MulAdd(s0_4, M4(4.190e-01, -1.207e-01, 2.708e-01, -6.375e-01, 1.740e-01, 1.955e-03, -1.816e-01, -7.933e-02, -9.308e-01, 1.333e-01, -1.335e-01, -1.401e-01, 3.447e-01, 3.389e-01, 6.660e-01, -3.387e-01), r);
	r = MulAdd(s0_5, M4(7.310e-02, 1.403e-02, 8.114e-02, 7.400e-02, -2.552e-02, -1.607e-01, -1.208e-01, -3.943e-02, -2.743e-02, -7.229e-03, -1.749e-03, 3.062e-01, 1.429e-01, 8.105e-01, 3.562e-01, 4.580e-01), r);
	r = MulAdd(s0_6, M4(2.115e-01, -1.686e-01, -1.948e-01, -1.191e-01, -5.798e-02, 3.493e-02, 8.264e-02, 1.579e-01, -1.081e-01, -1.775e-01, -8.196e-02, -2.085e-01, 6.791e-02, 1.652e-02, -4.933e-03, 2.833e-02), r);
	r = MulAdd(s0_7, M4(-2.160e-01, -3.858e-01, -8.407e-01, -1.091e-01, 8.415e-03, 8.626e-02, 2.340e-01, 9.177e-02, -4.697e-01, -6.623e-02, -5.176e-01, 6.762e-02, -3.437e-03, 6.570e-02, 7.630e-02, 8.988e-02), r);
	r = MulAdd(s0_8, M4(6.527e-02, -6.320e-02, 1.192e-02, -1.196e-01, -1.605e-02, -9.294e-03, 1.955e-01, -2.356e-02, -3.582e-02, 1.377e-02, 9.253e-02, -2.362e-02, 3.578e-02, 1.822e-01, 3.329e-01, 1.489e-01), r);
	r = MulAdd(s1_0, M4(1.154e-01, -1.822e-01, -2.122e-01, 3.031e-02, 6.550e-01, -4.855e-02, 6.554e-02, 4.432e-02, 1.671e-02, -4.477e-02, -9.428e-03, 4.413e-03, -3.185e-02, -1.529e-01, -1.222e-01, 6.523e-02), r);
	r = MulAdd(s1_1, M4(-4.920e-02, -1.697e-02, 4.141e-02, 1.997e-01, 6.972e-01, -5.157e-01, 2.031e-01, 2.829e-02, -5.005e-02, 2.335e-01, 2.985e-01, 6.871e-02, -5.232e-01, 2.146e-02, -1.418e+00, 2.193e-01), r);
	r = MulAdd(s1_2, M4(-6.472e-02, 2.595e-02, -2.610e-02, -2.279e-02, 4.165e-01, -7.745e-01, 1.261e-01, -3.845e-01, 3.279e-02, 2.445e-02, 1.796e-01, -2.581e-01, -3.838e-01, 6.280e-02, -4.893e-01, -1.475e-01), r);
	r = MulAdd(s1_3, M4(9.330e-02, 1.742e-01, -1.685e-01, 2.376e-02, -9.586e-01, -1.236e+00, -7.271e-01, -7.674e-01, 2.500e-01, -3.709e-02, -1.303e-01, 1.490e-01, -2.746e-01, -1.376e-01, -2.321e-02, -1.967e-02), r);
	r = MulAdd(s1_4, M4(3.660e-01, 4.772e-02, 5.524e-01, -2.804e-01, -2.756e+00, -1.336e+00, 2.038e-01, 2.593e+00, 2.156e-01, 3.281e-01, 3.152e-01, 8.064e-01, 3.970e-01, -1.379e-01, -7.518e-02, -2.723e-01), r);
	r = MulAdd(s1_5, M4(5.214e-03, 1.695e-02, 1.024e-01, 1.333e-01, -2.250e-01, -1.298e+00, 4.673e-01, 1.317e+00, 3.036e-01, -1.273e-01, 2.900e-01, 2.249e-02, -1.870e-01, -1.124e-01, -5.879e-01, 6.314e-02), r);
	r = MulAdd(s1_6, M4(-8.225e-02, -1.149e-01, 1.598e-04, -3.662e-01, -8.572e-02, -8.909e-01, 9.891e-02, 1.818e-01, 1.715e-01, -2.348e-01, 1.178e-01, -6.289e-02, 1.522e-02, 1.973e-02, 3.707e-02, 2.911e-02), r);
	r = MulAdd(s1_7, M4(-6.380e-02, 8.661e-02, -2.666e-01, 9.586e-02, -1.257e+00, -2.231e+00, -1.232e+00, 5.642e-01, 5.730e-02, -3.294e-01, -1.151e-01, 2.382e-01, 4.529e-02, 4.927e-02, 9.893e-02, 8.365e-02), r);
	r = MulAdd(s1_8, M4(1.906e-02, -8.920e-02, 8.931e-02, -6.752e-02, -3.680e-01, -1.282e+00, -1.388e-01, -7.545e-02, 6.262e-02, -1.695e-01, 2.278e-01, -3.066e-01, -7.412e-02, 1.145e-02, 4.667e-02, -4.205e-04), r);
	return r;
}

// Pass 2 ("conv1"): each thread handles one input-sized pixel. It reads the 3x3
// window of t0 around the pixel, splits every tap into max(tap, 0) and
// -max(-tap, 0), and writes f0() of those eighteen vectors to t1.
//   blockStart - pixel coordinate of this thread group's block
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8()
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro inside l0() refers to them.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the image have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Signed taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within each row).
	V4 k0 = l0(-1.0, -1.0);
	V4 k1 = l0(0.0, -1.0);
	V4 k2 = l0(1.0, -1.0);
	V4 k3 = l0(-1.0, 0.0);
	V4 k4 = l0(0.0, 0.0);
	V4 k5 = l0(1.0, 0.0);
	V4 k6 = l0(-1.0, 1.0);
	V4 k7 = l0(0.0, 1.0);
	V4 k8 = l0(1.0, 1.0);

	// Part of each tap that is >= 0.
	V4 p0 = max(k0, 0.0);
	V4 p1 = max(k1, 0.0);
	V4 p2 = max(k2, 0.0);
	V4 p3 = max(k3, 0.0);
	V4 p4 = max(k4, 0.0);
	V4 p5 = max(k5, 0.0);
	V4 p6 = max(k6, 0.0);
	V4 p7 = max(k7, 0.0);
	V4 p8 = max(k8, 0.0);

	// Part of each tap that is <= 0 (sign retained).
	V4 n0 = -max(-k0, 0.0);
	V4 n1 = -max(-k1, 0.0);
	V4 n2 = -max(-k2, 0.0);
	V4 n3 = -max(-k3, 0.0);
	V4 n4 = -max(-k4, 0.0);
	V4 n5 = -max(-k5, 0.0);
	V4 n6 = -max(-k6, 0.0);
	V4 n7 = -max(-k7, 0.0);
	V4 n8 = -max(-k8, 0.0);

	t1[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}

//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

// l0(x, y): Pass 3 tap — the four channels of t1 at offset (x, y), sampled via O()
// (which needs `pos` and `pt` in scope).
#define l0(x, y) V4(O(t1, float2(x, y)))

// Pass 3 ("conv2") kernel: maps eighteen 4-channel inputs to one 4-channel output.
// `r` starts from a constant 4-vector; every input is then folded in through
// MulAdd() together with its own constant 4x4 matrix (MulAdd is defined outside
// this file — presumably r + mul(input, matrix); confirm against its definition).
// Pass3 passes the 3x3 window twice: s0_k = max(tap_k, 0) and
// s1_k = -max(-tap_k, 0), with k in row-major order.
// The constants are fixed coefficients (presumably trained CuNNy weights); keep the
// accumulation order as generated, since it determines rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = { -1.292e-02, 8.156e-04, -2.055e-03, -3.100e-03 };
	r = MulAdd(s0_0, M4(2.965e-01, -1.919e-01, 9.202e-02, 8.775e-03, -4.948e-02, 1.061e-01, -3.754e-02, -1.900e-01, -2.114e-01, 1.267e-01, 1.989e-02, 2.570e-02, 4.634e-03, -2.718e-01, 2.171e-01, 1.512e-01), r);
	r = MulAdd(s0_1, M4(-5.527e-01, -4.825e-01, 4.325e-01, 4.447e-01, -6.577e-02, 5.161e-01, 3.286e-02, -3.800e-01, 2.625e-02, 3.835e-01, -7.794e-02, -5.489e-02, -2.647e-01, -4.952e-01, 1.587e-01, 1.471e-01), r);
	r = MulAdd(s0_2, M4(-3.687e-01, -1.096e-01, 1.849e-01, -6.915e-02, 2.257e-01, 2.760e-01, -8.875e-02, -8.871e-02, -8.394e-02, -6.714e-02, 5.322e-03, -3.252e-01, -7.885e-02, -2.723e-01, 6.149e-02, 2.998e-01), r);
	r = MulAdd(s0_3, M4(1.606e-01, -1.199e-01, 3.573e-01, 2.833e-02, 6.514e-03, -2.242e-02, -6.231e-02, 6.702e-02, -8.717e-02, -2.227e-01, -1.626e-01, 5.313e-02, -1.411e-01, -2.445e-02, 1.194e-01, -1.101e-01), r);
	r = MulAdd(s0_4, M4(-1.127e+00, 1.823e-01, 1.358e-01, -1.618e-01, -4.171e-04, -7.771e-02, 2.147e-01, 6.493e-01, 4.989e-01, 3.955e-01, -1.017e-01, -2.861e-01, 3.878e-01, -6.653e-01, -4.968e-01, -5.063e-01), r);
	r = MulAdd(s0_5, M4(-2.270e-01, -3.965e-01, -2.794e-02, 1.487e-01, -2.667e-01, -1.410e-02, 1.475e-01, -4.992e-01, -1.071e-01, 2.096e-01, 1.159e-01, -6.073e-02, -7.157e-02, -2.446e-01, -4.807e-02, 1.968e-01), r);
	r = MulAdd(s0_6, M4(8.199e-02, 8.336e-02, -3.090e-02, -1.287e-02, -6.954e-02, -7.544e-02, 1.272e-01, 7.930e-02, -3.647e-02, -2.685e-02, -4.235e-02, 3.214e-02, -4.526e-02, 1.479e-01, -4.963e-02, -3.035e-02), r);
	r = MulAdd(s0_7, M4(-2.012e-02, -1.497e-02, -2.952e-01, -6.026e-02, 2.135e-03, 2.979e-02, -2.713e-02, 7.951e-03, -8.069e-02, -2.374e-01, 1.865e-01, 1.048e-01, -9.076e-02, 6.683e-02, 9.576e-02, -2.432e-02), r);
	r = MulAdd(s0_8, M4(1.455e-01, 2.613e-01, -1.616e-01, -3.564e-01, 1.229e-01, -3.778e-02, 3.316e-02, 5.927e-02, -1.831e-01, -1.388e-01, 5.986e-02, 2.083e-02, -1.368e-03, 2.394e-01, -1.623e-01, -2.768e-02), r);
	r = MulAdd(s1_0, M4(7.711e-03, -6.696e-04, -3.229e-02, 1.549e-02, -1.596e-01, 2.068e-01, -6.162e-02, -9.571e-02, -1.500e-01, 1.743e-01, 2.746e-02, -5.845e-02, -7.649e-03, -4.265e-03, 4.154e-03, 3.950e-03), r);
	r = MulAdd(s1_1, M4(2.764e-01, -4.505e-02, 4.280e-02, 6.044e-02, 3.396e-02, 2.750e-01, -1.910e-01, -2.153e-01, 9.633e-02, -2.194e-02, -2.131e-01, -1.181e-01, -1.343e-01, 6.123e-02, 1.904e-02, -6.568e-02), r);
	r = MulAdd(s1_2, M4(-3.643e-01, -1.709e-02, 1.528e-01, -1.405e-01, 3.307e-01, -1.979e-03, -1.819e-01, 7.635e-02, 1.266e-01, 2.162e-01, -7.492e-02, -9.075e-02, 4.120e-02, 1.521e-01, -2.790e-03, -4.330e-02), r);
	r = MulAdd(s1_3, M4(1.913e-02, -5.373e-02, 5.748e-02, -1.443e-02, -2.776e-01, -1.162e-01, -1.994e-01, 1.430e-01, 9.058e-02, -3.720e-02, -3.585e-02, -8.516e-02, -2.228e-02, 7.507e-02, -9.620e-02, -1.013e-01), r);
	r = MulAdd(s1_4, M4(-3.592e-01, 1.415e-01, 1.018e+00, -1.555e-01, 5.378e-01, 8.818e-02, 2.190e-01, 1.997e-01, -1.128e-01, 3.331e-02, -1.410e-01, 2.844e-01, 4.756e-01, -5.850e-02, -3.757e-01, -1.716e-01), r);
	r = MulAdd(s1_5, M4(2.636e-02, -3.596e-01, -3.280e-01, 2.027e-01, 3.000e-01, -2.297e-01, 4.282e-02, 1.776e-01, 5.222e-02, 1.751e-01, 4.529e-02, -8.347e-02, -3.409e-01, -2.640e-01, 1.753e-01, -5.672e-01), r);
	r = MulAdd(s1_6, M4(-1.699e-02, 4.941e-02, -2.642e-02, -1.406e-04, -1.655e-01, -1.464e-02, -4.353e-02, 1.946e-01, 6.067e-02, -1.429e-01, 1.170e-01, -4.644e-02, -6.567e-02, -2.264e-02, 6.666e-02, 9.009e-02), r);
	r = MulAdd(s1_7, M4(7.805e-02, 2.173e-02, -3.276e-01, 2.004e-03, -7.789e-02, -1.466e-02, -1.560e-01, -1.126e-01, -3.823e-02, -2.446e-03, 1.465e-01, -2.744e-01, -2.129e-01, -2.141e-02, 4.456e-01, 1.240e-01), r);
	r = MulAdd(s1_8, M4(1.315e-02, 2.686e-01, -1.987e-01, -2.093e-01, 3.184e-02, -8.723e-02, 3.012e-01, 3.580e-01, 1.198e-02, -2.655e-01, 1.455e-01, 7.602e-02, -4.605e-02, 3.276e-01, -2.036e-01, -2.590e-01), r);
	return r;
}

// Pass 3 ("conv2"): each thread handles one input-sized pixel. It reads the 3x3
// window of t1 around the pixel, splits every tap into max(tap, 0) and
// -max(-tap, 0), and writes f0() of those eighteen vectors to t0.
//   blockStart - pixel coordinate of this thread group's block
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8()
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro inside l0() refers to them.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the image have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Signed taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within each row).
	V4 k0 = l0(-1.0, -1.0);
	V4 k1 = l0(0.0, -1.0);
	V4 k2 = l0(1.0, -1.0);
	V4 k3 = l0(-1.0, 0.0);
	V4 k4 = l0(0.0, 0.0);
	V4 k5 = l0(1.0, 0.0);
	V4 k6 = l0(-1.0, 1.0);
	V4 k7 = l0(0.0, 1.0);
	V4 k8 = l0(1.0, 1.0);

	// Part of each tap that is >= 0.
	V4 p0 = max(k0, 0.0);
	V4 p1 = max(k1, 0.0);
	V4 p2 = max(k2, 0.0);
	V4 p3 = max(k3, 0.0);
	V4 p4 = max(k4, 0.0);
	V4 p5 = max(k5, 0.0);
	V4 p6 = max(k6, 0.0);
	V4 p7 = max(k7, 0.0);
	V4 p8 = max(k8, 0.0);

	// Part of each tap that is <= 0 (sign retained).
	V4 n0 = -max(-k0, 0.0);
	V4 n1 = -max(-k1, 0.0);
	V4 n2 = -max(-k2, 0.0);
	V4 n3 = -max(-k3, 0.0);
	V4 n4 = -max(-k4, 0.0);
	V4 n5 = -max(-k5, 0.0);
	V4 n6 = -max(-k6, 0.0);
	V4 n7 = -max(-k7, 0.0);
	V4 n8 = -max(-k8, 0.0);

	t0[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}

//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

// l0(x, y): Pass 4 tap — the four channels of t0 at offset (x, y), sampled via O()
// (which needs `pos` and `pt` in scope).
#define l0(x, y) V4(O(t0, float2(x, y)))

// Pass 4 ("conv3") kernel: maps eighteen 4-channel inputs to one 4-channel output.
// `r` starts from a constant 4-vector; every input is then folded in through
// MulAdd() together with its own constant 4x4 matrix (MulAdd is defined outside
// this file — presumably r + mul(input, matrix); confirm against its definition).
// Pass4 passes the 3x3 window twice: s0_k = max(tap_k, 0) and
// s1_k = -max(-tap_k, 0), with k in row-major order.
// The constants are fixed coefficients (presumably trained CuNNy weights); keep the
// accumulation order as generated, since it determines rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = { 3.045e-03, 3.707e-03, -6.011e-03, -5.162e-03 };
	r = MulAdd(s0_0, M4(2.151e-02, -4.754e-02, 3.454e-02, -1.338e-03, -4.337e-02, 4.608e-02, -1.116e-01, -2.296e-02, -2.839e-02, -3.878e-01, -2.317e-02, 5.774e-02, 4.317e-03, 6.680e-02, 6.325e-02, -1.449e-01), r);
	r = MulAdd(s0_1, M4(-1.173e-01, -8.942e-02, -1.017e-01, 6.496e-02, 5.558e-02, 2.788e-02, 2.184e-02, -2.837e-03, -1.057e-01, -2.075e-01, -3.255e-02, -1.297e-02, -2.643e-02, -1.695e-02, -9.425e-02, 3.942e-02), r);
	r = MulAdd(s0_2, M4(-1.773e-02, -4.118e-02, -2.141e-02, 4.282e-02, 4.234e-02, -1.221e-02, -3.375e-03, 4.469e-02, -2.586e-01, -1.112e-01, -7.688e-02, 3.426e-02, 8.170e-02, -2.355e-02, -3.737e-02, 3.004e-02), r);
	r = MulAdd(s0_3, M4(2.192e-01, 1.955e+00, 2.012e-01, -2.598e-02, -7.453e-02, 5.510e-02, -1.517e-01, -2.571e-01, -2.182e-02, -2.345e-02, -5.767e-02, -5.534e-02, -1.996e-02, 2.329e-01, 4.447e-04, -1.111e-01), r);
	r = MulAdd(s0_4, M4(3.476e-01, -4.368e-01, -1.180e-01, 5.371e-01, 5.294e-01, 1.509e-01, 2.456e-01, -7.875e-02, 2.055e-01, 9.732e-02, 1.285e-01, 5.178e-01, 3.256e-01, -2.842e-01, 4.421e-02, 3.426e-01), r);
	r = MulAdd(s0_5, M4(6.119e-01, -1.393e-01, -1.144e-02, 2.438e-01, -5.126e-02, -1.049e-01, -7.847e-02, 9.942e-02, 5.371e-01, 9.985e-02, 9.193e-02, -3.067e-02, -1.962e-01, -4.272e-02, -7.821e-03, 2.557e-02), r);
	r = MulAdd(s0_6, M4(1.224e-02, -5.098e-01, 3.052e-01, 5.332e-01, 2.249e-01, 4.201e-02, 5.423e-01, 1.106e-01, -1.056e-02, -4.091e-03, -1.267e-02, -5.280e-02, 1.898e-02, 9.430e-03, 1.470e-02, 7.235e-02), r);
	r = MulAdd(s0_7, M4(-4.342e-01, 2.385e-01, -3.834e-02, -7.654e-02, -9.043e-01, -3.139e-01, -1.511e-01, 3.800e-01, -8.848e-02, -3.911e-02, -7.025e-03, -1.196e-02, -3.322e-03, -1.455e-01, 2.084e-02, 1.106e-01), r);
	r = MulAdd(s0_8, M4(1.382e-01, -1.894e-01, -8.814e-02, 1.373e-01, 1.362e-01, -1.298e-01, -1.007e-01, 1.166e-01, -1.553e-02, 8.530e-02, 2.744e-02, -1.083e-01, -5.606e-02, 5.965e-02, 1.406e-02, -4.496e-02), r);
	r = MulAdd(s1_0, M4(-4.828e-03, -1.035e-01, -5.021e-02, 1.972e-02, -9.942e-03, -3.057e-01, -7.373e-03, 4.274e-02, -3.475e-03, 4.653e-02, 9.115e-03, -5.794e-02, 1.170e-02, 1.322e-01, 1.195e-01, -2.535e-02), r);
	r = MulAdd(s1_1, M4(-5.424e-02, -1.541e-01, -9.945e-02, 8.862e-02, -1.198e-01, -3.591e-05, 4.305e-02, -1.079e-01, 1.605e-02, -3.377e-02, -5.398e-02, 1.201e-02, 3.432e-02, 1.090e-02, 8.871e-02, 3.186e-02), r);
	r = MulAdd(s1_2, M4(-1.108e-01, -3.481e-02, -1.616e-02, -4.136e-03, -3.382e-02, 1.836e-02, -3.071e-02, -3.186e-02, -1.014e-01, -1.412e-01, -7.790e-02, 9.763e-02, -1.624e-02, -2.520e-02, -2.152e-02, 2.524e-02), r);
	r = MulAdd(s1_3, M4(3.337e-03, -1.439e-02, 2.317e-03, 2.097e-01, 5.091e-03, 4.138e-02, -5.988e-02, -2.348e-02, -5.626e-03, 1.695e-02, 2.371e-02, -1.652e-02, 8.541e-02, -1.851e-01, 1.130e+00, -1.181e-01), r);
	r = MulAdd(s1_4, M4(1.184e-01, -3.385e-02, 2.659e-02, 3.233e-01, 2.333e-01, 1.694e-01, 1.915e-01, 1.162e-01, 4.309e-02, -3.793e-02, 1.412e-01, -1.345e-02, -6.074e-01, -2.408e-01, -1.306e-01, 1.033e-01), r);
	r = MulAdd(s1_5, M4(3.452e-01, 1.401e-01, 3.650e-02, -4.950e-02, 1.755e-01, -1.210e-01, -1.041e-02, 1.281e-01, 4.262e-01, 2.166e-02, 3.851e-02, 1.295e-01, -1.910e-01, -2.029e-02, -2.151e-02, -1.537e-02), r);
	r = MulAdd(s1_6, M4(4.989e-03, -5.730e-02, 5.803e-02, 2.946e-02, 1.825e-02, 2.660e-02, -4.900e-03, 3.848e-03, 1.078e-02, 1.823e-02, -4.751e-03, 4.219e-02, -1.024e-01, 7.721e-02, -6.709e-01, 8.423e-02), r);
	r = MulAdd(s1_7, M4(-1.567e-01, 4.125e-02, -2.721e-02, -1.831e-01, 9.470e-03, -1.205e-01, 1.793e-02, 1.160e-01, -4.874e-02, -4.902e-02, -1.479e-01, 7.102e-02, 6.699e-01, -1.383e-01, 1.314e-01, 2.999e-01), r);
	r = MulAdd(s1_8, M4(-2.625e-01, -9.735e-02, -6.038e-02, 3.588e-03, 2.247e-02, 4.993e-02, 1.171e-02, -2.071e-02, 2.066e-01, 2.852e-01, -5.781e-02, -3.231e-01, 6.922e-02, 8.960e-02, 9.107e-02, -2.880e-02), r);
	return r;
}

// Pass 4 ("conv3"): each thread handles one input-sized pixel. It reads the 3x3
// window of t0 around the pixel, splits every tap into max(tap, 0) and
// -max(-tap, 0), and writes f0() of those eighteen vectors to t1.
//   blockStart - pixel coordinate of this thread group's block
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8()
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro inside l0() refers to them.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the image have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Signed taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within each row).
	V4 k0 = l0(-1.0, -1.0);
	V4 k1 = l0(0.0, -1.0);
	V4 k2 = l0(1.0, -1.0);
	V4 k3 = l0(-1.0, 0.0);
	V4 k4 = l0(0.0, 0.0);
	V4 k5 = l0(1.0, 0.0);
	V4 k6 = l0(-1.0, 1.0);
	V4 k7 = l0(0.0, 1.0);
	V4 k8 = l0(1.0, 1.0);

	// Part of each tap that is >= 0.
	V4 p0 = max(k0, 0.0);
	V4 p1 = max(k1, 0.0);
	V4 p2 = max(k2, 0.0);
	V4 p3 = max(k3, 0.0);
	V4 p4 = max(k4, 0.0);
	V4 p5 = max(k5, 0.0);
	V4 p6 = max(k6, 0.0);
	V4 p7 = max(k7, 0.0);
	V4 p8 = max(k8, 0.0);

	// Part of each tap that is <= 0 (sign retained).
	V4 n0 = -max(-k0, 0.0);
	V4 n1 = -max(-k1, 0.0);
	V4 n2 = -max(-k2, 0.0);
	V4 n3 = -max(-k3, 0.0);
	V4 n4 = -max(-k4, 0.0);
	V4 n5 = -max(-k5, 0.0);
	V4 n6 = -max(-k6, 0.0);
	V4 n7 = -max(-k7, 0.0);
	V4 n8 = -max(-k8, 0.0);

	t1[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}

//!PASS 5
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t1
//!OUT OUTPUT

// l0(x, y): Pass 5 tap — the four channels of t1 at offset (x, y), sampled via O()
// (which needs `pos` and `pt` in scope).
#define l0(x, y) V4(O(t1, float2(x, y)))

// Pass 5 ("out-shuffle") kernel: maps eighteen 4-channel inputs to one 4-channel
// output and squashes it with tanh(), so every returned channel lies in (-1, 1).
// `r` starts from a constant 4-vector; every input is then folded in through
// MulAdd() together with its own constant 4x4 matrix (MulAdd is defined outside
// this file — presumably r + mul(input, matrix); confirm against its definition).
// Pass5 passes the 3x3 window twice: s0_k = max(tap_k, 0) and
// s1_k = -max(-tap_k, 0), with k in row-major order; it then uses the four
// returned channels as per-pixel offsets for a 2x2 block of output pixels.
// The constants are fixed coefficients (presumably trained CuNNy weights); keep the
// accumulation order as generated, since it determines rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = { 1.708e-05, 2.435e-04, 1.267e-03, 1.926e-03 };
	r = MulAdd(s0_0, M4(1.116e-01, 1.402e-01, 1.439e-02, 5.091e-02, -1.526e-02, -2.562e-02, -1.193e-02, -1.365e-02, -6.156e-02, -3.463e-02, 2.155e-02, -2.192e-02, -2.937e-02, -1.072e-01, -4.538e-02, -3.302e-02), r);
	r = MulAdd(s0_1, M4(-1.192e-02, -1.724e-02, 9.899e-03, -5.861e-03, -1.552e-02, 2.422e-02, 4.929e-03, 7.339e-03, 4.700e-02, 1.993e-01, -6.323e-02, 5.778e-02, 1.499e-01, 3.916e-01, -4.578e-02, -2.026e-02), r);
	r = MulAdd(s0_2, M4(5.431e-03, 1.916e-03, -2.064e-03, -6.545e-04, -1.731e-02, -8.081e-02, 1.391e-02, -7.036e-03, 7.739e-02, -1.588e-01, 2.970e-02, 3.357e-02, 3.869e-02, -7.824e-02, 1.813e-02, -6.252e-02), r);
	r = MulAdd(s0_3, M4(5.283e-01, 8.076e-02, 3.430e-01, 2.332e-01, -3.540e-02, 1.903e-02, -1.354e-02, -1.415e-02, -1.644e-01, -1.319e-02, -9.781e-02, -3.256e-02, 2.768e-02, -3.914e-02, 1.596e-01, -1.067e-01), r);
	r = MulAdd(s0_4, M4(-1.638e-02, 4.385e-01, -1.479e-01, -1.789e-02, -1.399e-01, -5.884e-02, -7.306e-02, -2.036e-03, 5.196e-01, -1.849e-01, 8.771e-01, 3.595e-01, -7.094e-01, 2.485e-02, -3.977e-02, 7.246e-01), r);
	r = MulAdd(s0_5, M4(-1.647e-03, -6.027e-03, -3.787e-03, -1.975e-02, -4.810e-02, -4.557e-01, 4.921e-02, -1.313e-01, -2.044e-02, 3.533e-01, -7.591e-02, 1.249e-02, 2.648e-02, -5.215e-01, 1.204e-01, -2.254e-01), r);
	r = MulAdd(s0_6, M4(-2.852e-02, -1.630e-02, 1.249e-01, -1.758e-02, 4.285e-02, 1.425e-02, -1.595e-02, 2.618e-02, 4.460e-03, 1.266e-02, -3.914e-02, 1.111e-02, 5.378e-02, 2.199e-02, 2.561e-03, 2.125e-02), r);
	r = MulAdd(s0_7, M4(-6.567e-02, -4.333e-02, -4.153e-03, 1.692e-01, 5.376e-02, 5.736e-02, -1.860e-01, -9.094e-02, 3.357e-02, -3.186e-02, 1.244e-01, -9.606e-02, 6.227e-02, 6.827e-02, -2.086e-01, -6.625e-02), r);
	r = MulAdd(s0_8, M4(4.553e-05, -3.116e-02, 1.023e-02, 2.322e-02, 8.623e-02, 1.125e-01, 2.802e-02, -2.768e-01, -1.003e-01, -2.143e-02, -2.413e-02, 1.460e-01, 5.421e-02, 5.798e-02, 3.478e-03, -1.421e-01), r);
	r = MulAdd(s1_0, M4(2.165e-01, 1.123e-01, -3.653e-02, -6.070e-03, -1.021e-01, -6.901e-04, 6.256e-03, -3.182e-03, -4.285e-02, -6.763e-02, 2.278e-02, -1.860e-02, -2.689e-02, 2.567e-02, 2.634e-03, 3.600e-02), r);
	r = MulAdd(s1_1, M4(-1.159e-01, -1.198e-01, 2.991e-02, -6.143e-02, 1.038e-01, -5.076e-02, -1.785e-02, -3.611e-02, 6.860e-02, 9.302e-02, -1.125e-02, 3.332e-02, 6.457e-02, -3.919e-02, 4.158e-03, -1.201e-02), r);
	r = MulAdd(s1_2, M4(-6.554e-03, 3.359e-02, -2.003e-02, -2.227e-04, 3.354e-02, -3.700e-02, -9.588e-03, -3.740e-02, -1.336e-02, -2.556e-04, -4.733e-03, -1.636e-02, 1.127e-02, 1.421e-02, -1.019e-02, -2.731e-02), r);
	r = MulAdd(s1_3, M4(3.642e-01, -3.756e-03, 6.584e-01, 1.773e-01, -1.638e-02, 1.109e-02, -7.427e-02, -1.572e-02, -1.869e-01, -3.059e-02, -8.088e-02, -5.092e-02, -5.794e-02, -4.431e-02, -7.912e-02, -9.767e-02), r);
	r = MulAdd(s1_4, M4(-3.255e-02, 3.115e-01, -2.109e-01, 2.804e-01, -6.504e-01, -1.342e-02, 1.355e-01, 3.623e-01, 5.142e-01, 2.124e-01, 1.866e-01, 2.268e-01, -2.470e-02, 1.629e-01, 1.163e-01, 1.663e-01), r);
	r = MulAdd(s1_5, M4(-1.093e-02, -1.640e-04, -3.502e-02, -3.746e-02, 1.836e-02, -5.959e-01, 1.323e-01, -2.388e-01, 3.482e-02, 1.823e-01, -3.895e-02, 5.164e-03, -7.314e-02, -3.897e-01, 6.275e-02, -3.974e-02), r);
	r = MulAdd(s1_6, M4(7.922e-03, -3.284e-02, 1.274e-01, -2.930e-02, 6.307e-02, 2.548e-02, -4.094e-02, 2.130e-02, -1.123e-02, 1.824e-03, -9.595e-02, 1.808e-02, 7.955e-02, 3.285e-02, 4.592e-02, 7.153e-02), r);
	r = MulAdd(s1_7, M4(-6.410e-02, -1.423e-02, -4.912e-02, 1.461e-01, 6.612e-02, 9.838e-02, -2.153e-01, -1.067e-01, -1.108e-02, -1.048e-01, 2.778e-01, -1.116e-01, 4.569e-02, 2.955e-02, -1.440e-01, -3.364e-02), r);
	r = MulAdd(s1_8, M4(1.721e-02, 1.171e-02, 1.096e-02, -2.832e-02, 7.446e-02, 4.785e-02, 8.270e-03, -1.640e-01, -8.912e-02, -6.617e-02, 3.225e-03, 9.894e-02, 4.367e-02, 8.102e-02, -1.779e-02, -2.410e-01), r);
	return tanh(r);
}

// Pass 5 ("out-shuffle"): each thread produces a 2x2 block of OUTPUT pixels.
// It evaluates f0() once on the 3x3 window of t1 around the matching input-sized
// pixel, then for every pixel of the block: linearly samples INPUT, converts it
// with rgb2yuv, adds one channel of the f0() result to the first component
// (clamped to [0, 1]), converts back with yuv2rgb and stores it with alpha 1.
// Channel-to-pixel mapping: x -> (0,0), y -> (+1,0), w -> (+1,+1), z -> (0,+1).
//   blockStart - output-pixel coordinate of this thread group's block
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8()
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro inside l0() refers to them.
	float2 pt = float2(GetInputPt());
	// Corner of this thread's 2x2 output block.
	const uint2 corner = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 extent = GetOutputSize();
	if (any(corner >= extent)) {
		return;
	}
	// Halving the output coordinate gives the input-sized pixel to read from.
	float2 pos = ((corner >> 1) + 0.5) * pt;

	// Signed taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within each row).
	V4 k0 = l0(-1.0, -1.0);
	V4 k1 = l0(0.0, -1.0);
	V4 k2 = l0(1.0, -1.0);
	V4 k3 = l0(-1.0, 0.0);
	V4 k4 = l0(0.0, 0.0);
	V4 k5 = l0(1.0, 0.0);
	V4 k6 = l0(-1.0, 1.0);
	V4 k7 = l0(0.0, 1.0);
	V4 k8 = l0(1.0, 1.0);

	// Part of each tap that is >= 0.
	V4 p0 = max(k0, 0.0);
	V4 p1 = max(k1, 0.0);
	V4 p2 = max(k2, 0.0);
	V4 p3 = max(k3, 0.0);
	V4 p4 = max(k4, 0.0);
	V4 p5 = max(k5, 0.0);
	V4 p6 = max(k6, 0.0);
	V4 p7 = max(k7, 0.0);
	V4 p8 = max(k8, 0.0);

	// Part of each tap that is <= 0 (sign retained).
	V4 n0 = -max(-k0, 0.0);
	V4 n1 = -max(-k1, 0.0);
	V4 n2 = -max(-k2, 0.0);
	V4 n3 = -max(-k3, 0.0);
	V4 n4 = -max(-k4, 0.0);
	V4 n5 = -max(-k5, 0.0);
	V4 n6 = -max(-k6, 0.0);
	V4 n7 = -max(-k7, 0.0);
	V4 n8 = -max(-k8, 0.0);

	// One offset per pixel of the 2x2 block.
	V4 delta = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	float2 opt = float2(GetOutputPt());

	// Sampling coordinate for the (0,0) pixel: half an output step back from `pos`.
	// It is then walked around the block with the same +=/-= steps as the pixel index.
	float2 uv = pos - 0.5f * opt;
	MF3 yuv;

	// (0,0) <- delta.x
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[corner] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// (+1,0) <- delta.y
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[corner + uint2(1, 0)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// (+1,+1) <- delta.w
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[corner + uint2(1, 1)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// (0,+1) <- delta.z
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[corner + uint2(0, 1)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
